In [1]:
#installing the necessary packages if not pre-installed
#pip install pandas
#pip install plotly
#pip install seaborn
#pip install matplotlib
#pip install numpy
#!pip install nbformat
#!pip install chart_studio
#!pip install matplotlib-colorbar
#!pip install category_encoders
In [2]:
#importing the necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [3]:
#importing the e-commerce dataset into a dataframe
ecd = pd.read_csv('C:\\Users\\rutvi\\OneDrive\\Desktop\\SEM 3\\DAB 303 Marketing Analytics\\E-Commerce Churn Data.csv')
ecd
Out[3]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered PreferedOrderCat SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount
0 50001 1 4.0 Mobile Phone 3 6.0 Debit Card Female 3.0 3 Laptop & Accessory 2 Single 9 1 11.0 1.0 1.0 5.0 160
1 50002 1 NaN Phone 1 8.0 UPI Male 3.0 4 Mobile 3 Single 7 1 15.0 0.0 1.0 0.0 121
2 50003 1 NaN Phone 1 30.0 Debit Card Male 2.0 4 Mobile 3 Single 6 1 14.0 0.0 1.0 3.0 120
3 50004 1 0.0 Phone 3 15.0 Debit Card Male 2.0 4 Laptop & Accessory 5 Single 8 0 23.0 0.0 1.0 3.0 134
4 50005 1 0.0 Phone 1 12.0 CC Male NaN 3 Mobile 5 Single 3 0 11.0 1.0 1.0 3.0 130
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5625 55626 0 10.0 Computer 1 30.0 Credit Card Male 3.0 2 Laptop & Accessory 1 Married 6 0 18.0 1.0 2.0 4.0 151
5626 55627 0 13.0 Mobile Phone 1 13.0 Credit Card Male 3.0 5 Fashion 5 Married 6 0 16.0 1.0 2.0 NaN 225
5627 55628 0 1.0 Mobile Phone 1 11.0 Debit Card Male 3.0 2 Laptop & Accessory 4 Married 3 1 21.0 1.0 2.0 4.0 186
5628 55629 0 23.0 Computer 3 9.0 Credit Card Male 4.0 5 Laptop & Accessory 4 Married 4 0 15.0 2.0 2.0 9.0 179
5629 55630 0 8.0 Mobile Phone 1 15.0 Credit Card Male 3.0 2 Laptop & Accessory 3 Married 4 0 13.0 2.0 2.0 3.0 169

5630 rows × 20 columns

In [4]:
#checking the column names
ecd.columns
Out[4]:
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
       'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')
In [5]:
#checking the description of dataset
pd.options.display.float_format = '{:20,.2f}'.format
ecd.describe().T
Out[5]:
count mean std min 25% 50% 75% max
CustomerID 5,630.00 52,815.50 1,625.39 50,001.00 51,408.25 52,815.50 54,222.75 55,630.00
Churn 5,630.00 0.17 0.37 0.00 0.00 0.00 0.00 1.00
Tenure 5,366.00 10.19 8.56 0.00 2.00 9.00 16.00 61.00
CityTier 5,630.00 1.65 0.92 1.00 1.00 1.00 3.00 3.00
WarehouseToHome 5,379.00 15.64 8.53 5.00 9.00 14.00 20.00 127.00
HourSpendOnApp 5,375.00 2.93 0.72 0.00 2.00 3.00 3.00 5.00
NumberOfDeviceRegistered 5,630.00 3.69 1.02 1.00 3.00 4.00 4.00 6.00
SatisfactionScore 5,630.00 3.07 1.38 1.00 2.00 3.00 4.00 5.00
NumberOfAddress 5,630.00 4.21 2.58 1.00 2.00 3.00 6.00 22.00
Complain 5,630.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
OrderAmountHikeFromlastYear 5,365.00 15.71 3.68 11.00 13.00 15.00 18.00 26.00
CouponUsed 5,374.00 1.75 1.89 0.00 1.00 1.00 2.00 16.00
OrderCount 5,372.00 3.01 2.94 1.00 1.00 2.00 3.00 16.00
DaySinceLastOrder 5,323.00 4.54 3.65 0.00 2.00 3.00 7.00 46.00
CashbackAmount 5,630.00 177.22 49.19 0.00 146.00 163.00 196.00 325.00
In [6]:
#checking the first 10 records of the dataset
ecd.head(10)
Out[6]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered PreferedOrderCat SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount
0 50001 1 4.00 Mobile Phone 3 6.00 Debit Card Female 3.00 3 Laptop & Accessory 2 Single 9 1 11.00 1.00 1.00 5.00 160
1 50002 1 NaN Phone 1 8.00 UPI Male 3.00 4 Mobile 3 Single 7 1 15.00 0.00 1.00 0.00 121
2 50003 1 NaN Phone 1 30.00 Debit Card Male 2.00 4 Mobile 3 Single 6 1 14.00 0.00 1.00 3.00 120
3 50004 1 0.00 Phone 3 15.00 Debit Card Male 2.00 4 Laptop & Accessory 5 Single 8 0 23.00 0.00 1.00 3.00 134
4 50005 1 0.00 Phone 1 12.00 CC Male NaN 3 Mobile 5 Single 3 0 11.00 1.00 1.00 3.00 130
5 50006 1 0.00 Computer 1 22.00 Debit Card Female 3.00 5 Mobile Phone 5 Single 2 1 22.00 4.00 6.00 7.00 139
6 50007 1 NaN Phone 3 11.00 Cash on Delivery Male 2.00 3 Laptop & Accessory 2 Divorced 4 0 14.00 0.00 1.00 0.00 121
7 50008 1 NaN Phone 1 6.00 CC Male 3.00 3 Mobile 2 Divorced 3 1 16.00 2.00 2.00 0.00 123
8 50009 1 13.00 Phone 3 9.00 E wallet Male NaN 4 Mobile 3 Divorced 2 1 14.00 0.00 1.00 2.00 127
9 50010 1 NaN Phone 1 31.00 Debit Card Male 2.00 5 Mobile 3 Single 2 0 12.00 1.00 1.00 1.00 123
In [7]:
#checking the shape of our dataset
ecd.shape
Out[7]:
(5630, 20)
In [8]:
#checking the dataset information
ecd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5365 non-null   float64
 16  CouponUsed                   5374 non-null   float64
 17  OrderCount                   5372 non-null   float64
 18  DaySinceLastOrder            5323 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB
In [9]:
#Detectung missing values
ecd.isnull().any()
Out[9]:
CustomerID                     False
Churn                          False
Tenure                          True
PreferredLoginDevice           False
CityTier                       False
WarehouseToHome                 True
PreferredPaymentMode           False
Gender                         False
HourSpendOnApp                  True
NumberOfDeviceRegistered       False
PreferedOrderCat               False
SatisfactionScore              False
MaritalStatus                  False
NumberOfAddress                False
Complain                       False
OrderAmountHikeFromlastYear     True
CouponUsed                      True
OrderCount                      True
DaySinceLastOrder               True
CashbackAmount                 False
dtype: bool
In [10]:
#counting the missing values
ecd.isna().sum()
Out[10]:
CustomerID                       0
Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64
In [11]:
# Plot missing values
ecd.isna().sum()[ecd.isna().sum() > 0].plot(kind='bar', color=plt.cm.Paired.colors)
plt.title("Missing Values")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [12]:
#total number of missing values in our dataset
ecd.isna().sum().sum()
Out[12]:
1856
In [13]:
#creating a function to display the data type, percentage of missing values and number of unique values per column
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')
In [14]:
sniff_modified(ecd)
Out[14]:
data type percent missing No. unique unique values
CustomerID int64 0.00 5630 [50001, 50002, 50003, 50004, 50005, 50006, 500...
Complain int64 0.00 2 [1, 0]
NumberOfAddress int64 0.00 15 [9, 7, 6, 8, 3, 2, 4, 10, 1, 5, 19, 21, 11, 20...
SatisfactionScore int64 0.00 5 [2, 3, 5, 4, 1]
NumberOfDeviceRegistered int64 0.00 6 [3, 4, 5, 2, 1, 6]
Churn int64 0.00 2 [1, 0]
CityTier int64 0.00 3 [3, 1, 2]
CashbackAmount int64 0.00 220 [160, 121, 120, 134, 130, 139, 123, 127, 295, ...
WarehouseToHome float64 4.46 35 [6.0, 8.0, 30.0, 15.0, 12.0, 22.0, 11.0, 9.0, ...
HourSpendOnApp float64 4.53 7 [3.0, 2.0, nan, 1.0, 0.0, 4.0, 5.0]
DaySinceLastOrder float64 5.45 23 [5.0, 0.0, 3.0, 7.0, 2.0, 1.0, 8.0, 6.0, 4.0, ...
Tenure float64 4.69 37 [4.0, nan, 0.0, 13.0, 11.0, 9.0, 19.0, 20.0, 1...
OrderAmountHikeFromlastYear float64 4.71 17 [11.0, 15.0, 14.0, 23.0, 22.0, 16.0, 12.0, nan...
CouponUsed float64 4.55 18 [1.0, 0.0, 4.0, 2.0, 9.0, 6.0, 11.0, nan, 7.0,...
OrderCount float64 4.58 17 [1.0, 6.0, 2.0, 15.0, 4.0, 7.0, 3.0, 9.0, nan,...
PreferredPaymentMode object 0.00 7 [Debit Card, UPI, CC, Cash on Delivery, E wall...
Gender object 0.00 2 [Female, Male]
PreferedOrderCat object 0.00 6 [Laptop & Accessory, Mobile, Mobile Phone, Oth...
PreferredLoginDevice object 0.00 3 [Mobile Phone, Phone, Computer]
MaritalStatus object 0.00 3 [Single, Divorced, Married]
In [15]:
#creating a function to find the columns with missing values, extract the number and percentage of these missing values in relation to the dataset

def FindMissingColsPercentage(df):

    total = 0
    for col in df.columns:
        missing_vals = df[col].isnull().sum()
        # mean = sum / total
        pct = df[col].isnull().mean() * 100 
        if missing_vals != 0:
          print('{} => {} [{}%]'.format(col, df[col].isnull().sum(), round(pct, 2)))
        total += missing_vals
    
    if total == 0:
        print("no missing values")
In [16]:
FindMissingColsPercentage(ecd)
Tenure => 264 [4.69%]
WarehouseToHome => 251 [4.46%]
HourSpendOnApp => 255 [4.53%]
OrderAmountHikeFromlastYear => 265 [4.71%]
CouponUsed => 256 [4.55%]
OrderCount => 258 [4.58%]
DaySinceLastOrder => 307 [5.45%]
In [17]:
#replacing all empty spaces with np.NaN
ecd_clean = ecd.replace(" ", np.NaN)
In [18]:
# replacing all missing values(NaN) in the dataset with 0
ecd_clean = ecd_clean.fillna(0)
In [19]:
#checking dataset information after replacing the missing values
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5630 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5630 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5630 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5630 non-null   float64
 16  CouponUsed                   5630 non-null   float64
 17  OrderCount                   5630 non-null   float64
 18  DaySinceLastOrder            5630 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB
In [20]:
ecd_clean.isnull().any()
Out[20]:
CustomerID                     False
Churn                          False
Tenure                         False
PreferredLoginDevice           False
CityTier                       False
WarehouseToHome                False
PreferredPaymentMode           False
Gender                         False
HourSpendOnApp                 False
NumberOfDeviceRegistered       False
PreferedOrderCat               False
SatisfactionScore              False
MaritalStatus                  False
NumberOfAddress                False
Complain                       False
OrderAmountHikeFromlastYear    False
CouponUsed                     False
OrderCount                     False
DaySinceLastOrder              False
CashbackAmount                 False
dtype: bool
In [21]:
FindMissingColsPercentage(ecd_clean)
no missing values
In [22]:
#viewing the object data type columns
ecd_clean[['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']]
Out[22]:
PreferredPaymentMode Gender PreferedOrderCat PreferredLoginDevice MaritalStatus
0 Debit Card Female Laptop & Accessory Mobile Phone Single
1 UPI Male Mobile Phone Single
2 Debit Card Male Mobile Phone Single
3 Debit Card Male Laptop & Accessory Phone Single
4 CC Male Mobile Phone Single
... ... ... ... ... ...
5625 Credit Card Male Laptop & Accessory Computer Married
5626 Credit Card Male Fashion Mobile Phone Married
5627 Debit Card Male Laptop & Accessory Mobile Phone Married
5628 Credit Card Male Laptop & Accessory Computer Married
5629 Credit Card Male Laptop & Accessory Mobile Phone Married

5630 rows × 5 columns

In [23]:
#checking the unique values in these columns
obj = ['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']  #creating a list to the column names

for i in obj: #creating a for loop to print out the column name and unique values and count
    #print(i, ecd_clean[i].nunique(), '\n',ecd_clean[i].unique())
    print(ecd_clean[i].value_counts(),'\n')
PreferredPaymentMode
Debit Card          2314
Credit Card         1501
E wallet             614
UPI                  414
COD                  365
CC                   273
Cash on Delivery     149
Name: count, dtype: int64 

Gender
Male      3384
Female    2246
Name: count, dtype: int64 

PreferedOrderCat
Laptop & Accessory    2050
Mobile Phone          1271
Fashion                826
Mobile                 809
Grocery                410
Others                 264
Name: count, dtype: int64 

PreferredLoginDevice
Mobile Phone    2765
Computer        1634
Phone           1231
Name: count, dtype: int64 

MaritalStatus
Married     2986
Single      1796
Divorced     848
Name: count, dtype: int64 

Following a review of the unique values of these columns shown above, we can assume the following:

that 'CC' and 'COD' is the same as 'credit card' and 'Cash on Delivery' respectively under the PreferredPaymentMode column 'Mobile' Phone and 'Phone' connotes the same meaning for the records in the PreferredLoginDevice column. 'Mobile Phone' and 'Mobile' is the same for records in the PreferedOrderCat column. Based on these, we will proceed to further clean the data by replacing the following

'CC' with 'Credit Card' 'COD' with 'Cash on Delivery' 'Phone' with 'Mobile Phone', and 'Mobile' with 'Mobile Phone'.

In [24]:
#replacing the data entries that has the same meaning
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace('CC', 'Credit Card')
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace('COD', 'Cash on Delivery')
ecd_clean['PreferedOrderCat'] = ecd_clean['PreferedOrderCat'].replace('Mobile', 'Mobile Phone')
ecd_clean['PreferredLoginDevice'] = ecd_clean['PreferredLoginDevice'].replace('Phone', 'Mobile Phone')
In [25]:
#cross-checking the replacement has been effected
for i in obj: #creating a for loop to print out the column name and unique values and count
    print(ecd_clean[i].value_counts(),'\n')
PreferredPaymentMode
Debit Card          2314
Credit Card         1774
E wallet             614
Cash on Delivery     514
UPI                  414
Name: count, dtype: int64 

Gender
Male      3384
Female    2246
Name: count, dtype: int64 

PreferedOrderCat
Mobile Phone          2080
Laptop & Accessory    2050
Fashion                826
Grocery                410
Others                 264
Name: count, dtype: int64 

PreferredLoginDevice
Mobile Phone    3996
Computer        1634
Name: count, dtype: int64 

MaritalStatus
Married     2986
Single      1796
Divorced     848
Name: count, dtype: int64 

In [26]:
#creating categories from tenure column into a new column - tenuregroup
ranges = [0,10,20,30,40,50,60,np.inf] #list to hold the bin ranges
group_names = ['0-10 years', '11-20 years', '21-30 years', '31-40 years', '41-50 years', '51-60 years', '61 years & above'] # list to hold the labels
ecd_clean['TenureGroup'] = pd.cut(ecd_clean['Tenure'], bins = ranges, labels = group_names, include_lowest = True)
ecd_clean[['Tenure', 'TenureGroup']]
Out[26]:
Tenure TenureGroup
0 4.00 0-10 years
1 0.00 0-10 years
2 0.00 0-10 years
3 0.00 0-10 years
4 0.00 0-10 years
... ... ...
5625 10.00 0-10 years
5626 13.00 11-20 years
5627 1.00 0-10 years
5628 23.00 21-30 years
5629 8.00 0-10 years

5630 rows × 2 columns

In [27]:
#checking the final clean data
ecd_clean.head(20)
Out[27]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered ... SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount TenureGroup
0 50001 1 4.00 Mobile Phone 3 6.00 Debit Card Female 3.00 3 ... 2 Single 9 1 11.00 1.00 1.00 5.00 160 0-10 years
1 50002 1 0.00 Mobile Phone 1 8.00 UPI Male 3.00 4 ... 3 Single 7 1 15.00 0.00 1.00 0.00 121 0-10 years
2 50003 1 0.00 Mobile Phone 1 30.00 Debit Card Male 2.00 4 ... 3 Single 6 1 14.00 0.00 1.00 3.00 120 0-10 years
3 50004 1 0.00 Mobile Phone 3 15.00 Debit Card Male 2.00 4 ... 5 Single 8 0 23.00 0.00 1.00 3.00 134 0-10 years
4 50005 1 0.00 Mobile Phone 1 12.00 Credit Card Male 0.00 3 ... 5 Single 3 0 11.00 1.00 1.00 3.00 130 0-10 years
5 50006 1 0.00 Computer 1 22.00 Debit Card Female 3.00 5 ... 5 Single 2 1 22.00 4.00 6.00 7.00 139 0-10 years
6 50007 1 0.00 Mobile Phone 3 11.00 Cash on Delivery Male 2.00 3 ... 2 Divorced 4 0 14.00 0.00 1.00 0.00 121 0-10 years
7 50008 1 0.00 Mobile Phone 1 6.00 Credit Card Male 3.00 3 ... 2 Divorced 3 1 16.00 2.00 2.00 0.00 123 0-10 years
8 50009 1 13.00 Mobile Phone 3 9.00 E wallet Male 0.00 4 ... 3 Divorced 2 1 14.00 0.00 1.00 2.00 127 11-20 years
9 50010 1 0.00 Mobile Phone 1 31.00 Debit Card Male 2.00 5 ... 3 Single 2 0 12.00 1.00 1.00 1.00 123 0-10 years
10 50011 1 4.00 Mobile Phone 1 18.00 Cash on Delivery Female 2.00 3 ... 3 Divorced 2 0 0.00 9.00 15.00 8.00 295 0-10 years
11 50012 1 11.00 Mobile Phone 1 6.00 Debit Card Male 3.00 4 ... 3 Single 10 1 13.00 0.00 1.00 0.00 154 11-20 years
12 50013 1 0.00 Mobile Phone 1 11.00 Cash on Delivery Male 2.00 3 ... 3 Single 2 1 13.00 2.00 2.00 2.00 134 0-10 years
13 50014 1 0.00 Mobile Phone 1 15.00 Credit Card Male 3.00 4 ... 3 Divorced 1 1 17.00 0.00 1.00 0.00 134 0-10 years
14 50015 1 9.00 Mobile Phone 3 15.00 Credit Card Male 3.00 4 ... 2 Single 2 0 16.00 0.00 4.00 7.00 196 0-10 years
15 50016 1 0.00 Mobile Phone 2 12.00 UPI Male 3.00 3 ... 5 Married 5 1 22.00 1.00 1.00 2.00 121 0-10 years
16 50017 1 0.00 Computer 1 12.00 Debit Card Female 0.00 4 ... 2 Single 2 1 18.00 1.00 1.00 0.00 129 0-10 years
17 50018 1 0.00 Mobile Phone 3 11.00 E wallet Male 2.00 4 ... 3 Single 2 1 11.00 1.00 1.00 3.00 157 0-10 years
18 50019 1 0.00 Computer 1 13.00 Debit Card Male 3.00 5 ... 3 Single 2 1 24.00 1.00 1.00 6.00 161 0-10 years
19 50020 1 19.00 Mobile Phone 1 20.00 Debit Card Female 3.00 3 ... 4 Divorced 10 1 18.00 1.00 4.00 3.00 150 11-20 years

20 rows × 21 columns

In [28]:
# Plotting graphs for better understanding of data distribution
cols = ['HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'DaySinceLastOrder']
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
for col, ax in zip(cols, axes.flatten()):
    ecd[col].value_counts().sort_index().plot(kind='line', ax=ax, title=col, color='purple')
    ax.set(xlabel='Values', ylabel='Frequency')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [29]:
import seaborn as sns
import matplotlib.pyplot as plt
ecd_1 =  ecd_clean.copy() #creating a copy of the cleaned dataset
# Replace churn values
ecd_1['Churn'] = ecd_1['Churn'].replace({0: "Customer Retained", 1: "Customer Churned"})

# Create the bar plot
sns.barplot(x='Churn', y='HourSpendOnApp', data=ecd_1, palette='viridis')
plt.title("Churn Rate", fontsize=16)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [30]:
import matplotlib.pyplot as plt

# Count churned and retained customers
Cust_Churn_Count = (ecd_clean['Churn'] == 1).sum()
Cust_Retained_count = (ecd_clean['Churn'] == 0).sum()

# Define labels and sizes
labels = ['Customers Churned', 'Customers Retained']
sizes = [Cust_Churn_Count, Cust_Retained_count]

# New color scheme (shades of blue and purple)
colors_given = ['#4B0082', '#4682B4']  # Indigo & Steel Blue

# Define the explode parameter (pulling apart slices)
explode = (0.1, 0)  # Pull apart 'Customers Churned' slice

fig, ax = plt.subplots()

# Create the pie chart
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, 
       wedgeprops={'edgecolor': 'black'}, colors=colors_given, explode=explode)
ax.axis('equal')  # Keep the chart as a circle

# Add legend
ax.legend(labels, loc="center left", bbox_to_anchor=(1.1, 1), edgecolor='black')

# Set title
plt.title('Customer Churn Analysis', fontweight='bold')
plt.show()
No description has been provided for this image
In [31]:
#visualising the percentage of customers that churned
ecd_clean['Churn'].value_counts().plot.pie(labels=['Customers who stayed', 'Customers who left'],
                                           autopct='%1.1f%%',shadow=True, colors=sns.color_palette('Purples'))
plt.title("% of Customer's that churned")
plt.show()
No description has been provided for this image
In [32]:
#checking out churn rate by gender distribution using a pivot table 
ecd_clean.pivot_table(values="Tenure", index="Gender", columns="Churn",
                      aggfunc='sum')
Out[32]:
Churn 0 1
Gender
Female 22,011.00 946.00
Male 29,738.00 1,984.00
In [33]:
import seaborn as sns
import matplotlib.pyplot as plt

# Creating the histogram
plt.figure(figsize=(8, 5))
sns.histplot(data=ecd_1, x='Churn', hue='Gender', hue_order=['Male', 'Female'], 
             multiple='dodge', palette='coolwarm', shrink=0.8)

# Title and labels
plt.title("Churn Rate by Gender")
plt.xlabel("Churn Status")
plt.ylabel("Count")
plt.xticks(ticks=[0, 1], labels=['Customer Retained', 'Customer Churned'])

plt.show()
No description has been provided for this image
In [34]:
tenure_counts = ecd_clean['TenureGroup'].value_counts().reset_index()
tenure_counts.columns = ['TenureGroup', 'Count']
sns.barplot(x='TenureGroup', y='Count', data=tenure_counts, palette='coolwarm')
# Rotate x-axis labels for readability
plt.xticks(rotation=90)
plt.title("Customer Distribution by Tenure Group")
plt.xlabel("Tenure Group")
plt.ylabel("Count")
Out[34]:
Text(0, 0.5, 'Count')
No description has been provided for this image
In [35]:
# Creating the bar plot
plt.figure(figsize=(10, 5))
sns.barplot(x='TenureGroup', y='Churn', data=ecd_clean, palette='coolwarm')
# Formatting the plot
plt.ylabel('Churn Rate')
plt.xlabel('Tenure Group')
plt.xticks(rotation=90)
plt.title("Churn Rate by Tenure Group")
plt.show()
No description has been provided for this image
In [36]:
sns.set_context("paper", font_scale=1.1)
# Create the histogram
plt.figure(figsize=(10, 5))
sns.histplot(ecd_clean[ecd_clean["Churn"] == 0]["DaySinceLastOrder"], 
             color="blue", label="Customers Retained", kde=False, bins=30, alpha=0.7)
sns.histplot(ecd_clean[ecd_clean["Churn"] == 1]["DaySinceLastOrder"], 
             color="red", label="Customers Churned", kde=False, bins=30, alpha=0.7)
# Formatting
plt.legend(loc='upper right')
plt.ylabel('Count')
plt.xlabel('Days Since Last Order')
plt.title('Churn Analysis based on Days Since Last Order')
plt.show()
No description has been provided for this image
In [37]:
# Define a custom color palette
custom_palette = ['#FF6347', '#4682B4', '#32CD32']  # Example colors: tomato, steelblue, limegreen

sns.lineplot(
    data=ecd, x='OrderCount',  y="Churn", 
    hue="CityTier", ci=None,
    palette=custom_palette
)
Out[37]:
<Axes: xlabel='OrderCount', ylabel='Churn'>
No description has been provided for this image
In [38]:
import seaborn as sns
import matplotlib.pyplot as plt

# Define a custom color palette for 'Churn' values
custom_palette = ['#FF6347', '#4682B4']  # Red for 'Customer Churned', Blue for 'Customer Retained'

order = ['Customer Retained', 'Customer Churned']

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot with custom palette and bar chart
sns.countplot(data=ecd_1, x='Complain', hue='Churn', ax=axes[0], hue_order=order, palette=custom_palette)
sns.countplot(data=ecd_1, x='SatisfactionScore', hue='Churn', ax=axes[1], hue_order=order, palette=custom_palette)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [39]:
# Define custom color palette
custom_palette = ['#FF6347', '#4682B4']

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Countplots with custom color palette
sns.countplot(data=ecd_1, x='CityTier', hue='Churn', ax=axes[0], hue_order=order, palette=custom_palette)
sns.countplot(data=ecd_1, x='PreferedOrderCat', hue='Churn', ax=axes[1], 
              order=['Laptop & Accessory', 'Mobile Phone', 'Fashion', 'Grocey', 'Others'], 
              hue_order=order, palette=custom_palette)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [40]:
# Define custom color palette
custom_palette = ['#FF6347', '#4682B4']

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Side-by-side bar plots using histplot with dodge=True
sns.histplot(data=ecd_1, x='HourSpendOnApp', hue='Churn', ax=axes[0], hue_order=order, 
             multiple='dodge', palette=custom_palette, discrete=True)

sns.histplot(data=ecd_1, x='NumberOfDeviceRegistered', hue='Churn', ax=axes[1], 
             hue_order=order, multiple='dodge', palette=custom_palette, discrete=True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [41]:
#subsetting the dataset to determine the churn characteristics of customer that registered a complaint with the company
count_of_complaints=ecd_clean.groupby(['Complain','Churn']).apply(lambda x:x['Churn'].count()).reset_index(name='No. of Customers')
count_of_complaints 
Out[41]:
Complain Churn No. of Customers
0 0 0 3586
1 0 1 440
2 1 0 1096
3 1 1 508
In [42]:
import matplotlib.pyplot as plt

# Data
complain_values = [0, 1]
churn_0 = [3586, 1096]  # Customer Retained
churn_1 = [440, 508]    # Customer Churned

complain_labels = ['No Complaint', 'Complaint']
bar_width = 0.30

# Positioning bars
r1 = range(len(complain_values))
r2 = [x + bar_width for x in r1]

# Create figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Change bar colors
plt.bar(r1, churn_0, color='#3CB371', width=bar_width, edgecolor='black', label='Customers Stayed')  # Green
plt.bar(r2, churn_1, color='#FF8C00', width=bar_width, edgecolor='black', label='Customers Left')  # Orange

# Add value labels
for i in range(len(complain_values)):
    plt.text(r1[i], churn_0[i] + 20, str(churn_0[i]), ha='center', va='bottom', color='black', fontweight='bold')
    plt.text(r2[i], churn_1[i] + 50, str(churn_1[i]), ha='center', va='bottom', color='black', fontweight='bold')

# Labels & Titles
plt.xlabel('Complain', fontweight='bold')
plt.xticks([r + bar_width/2 for r in range(len(complain_values))], complain_labels)
plt.ylabel('No. of Customers', fontweight='bold')
plt.title('Complaint Counts of Customers in a Company', fontweight='bold')

# Legend
plt.legend()

# Show plot
plt.show()
No description has been provided for this image
In [43]:
# Import all the libraries for machine learning models
!pip install --upgrade scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,f1_score
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
Requirement already satisfied: scikit-learn in c:\users\rutvi\anaconda3\lib\site-packages (1.6.1)
Requirement already satisfied: numpy>=1.19.5 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
In [44]:
ecd_clean.nunique()
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   CustomerID                   5630 non-null   int64   
 1   Churn                        5630 non-null   int64   
 2   Tenure                       5630 non-null   float64 
 3   PreferredLoginDevice         5630 non-null   object  
 4   CityTier                     5630 non-null   int64   
 5   WarehouseToHome              5630 non-null   float64 
 6   PreferredPaymentMode         5630 non-null   object  
 7   Gender                       5630 non-null   object  
 8   HourSpendOnApp               5630 non-null   float64 
 9   NumberOfDeviceRegistered     5630 non-null   int64   
 10  PreferedOrderCat             5630 non-null   object  
 11  SatisfactionScore            5630 non-null   int64   
 12  MaritalStatus                5630 non-null   object  
 13  NumberOfAddress              5630 non-null   int64   
 14  Complain                     5630 non-null   int64   
 15  OrderAmountHikeFromlastYear  5630 non-null   float64 
 16  CouponUsed                   5630 non-null   float64 
 17  OrderCount                   5630 non-null   float64 
 18  DaySinceLastOrder            5630 non-null   float64 
 19  CashbackAmount               5630 non-null   int64   
 20  TenureGroup                  5630 non-null   category
dtypes: category(1), float64(7), int64(8), object(5)
memory usage: 885.7+ KB
In [45]:
#converting the category dtype of TenureGroup column to numerical value
encoder = ce.OrdinalEncoder(mapping=[{'col': 'TenureGroup', 'mapping': {'0-10 years': 1, '11-20 years': 2, '21-30 years':3,
                                                                       '31-40 years': 4, '41-50 years': 5, '51-60 years': 6,
                                                                       '61 years & above': 7}}])

encoder.fit(ecd_clean)
ecd_clean = encoder.transform(ecd_clean)
ecd_clean.head(10)
Out[45]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered ... SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount TenureGroup
0 50001 1 4.00 Mobile Phone 3 6.00 Debit Card Female 3.00 3 ... 2 Single 9 1 11.00 1.00 1.00 5.00 160 1
1 50002 1 0.00 Mobile Phone 1 8.00 UPI Male 3.00 4 ... 3 Single 7 1 15.00 0.00 1.00 0.00 121 1
2 50003 1 0.00 Mobile Phone 1 30.00 Debit Card Male 2.00 4 ... 3 Single 6 1 14.00 0.00 1.00 3.00 120 1
3 50004 1 0.00 Mobile Phone 3 15.00 Debit Card Male 2.00 4 ... 5 Single 8 0 23.00 0.00 1.00 3.00 134 1
4 50005 1 0.00 Mobile Phone 1 12.00 Credit Card Male 0.00 3 ... 5 Single 3 0 11.00 1.00 1.00 3.00 130 1
5 50006 1 0.00 Computer 1 22.00 Debit Card Female 3.00 5 ... 5 Single 2 1 22.00 4.00 6.00 7.00 139 1
6 50007 1 0.00 Mobile Phone 3 11.00 Cash on Delivery Male 2.00 3 ... 2 Divorced 4 0 14.00 0.00 1.00 0.00 121 1
7 50008 1 0.00 Mobile Phone 1 6.00 Credit Card Male 3.00 3 ... 2 Divorced 3 1 16.00 2.00 2.00 0.00 123 1
8 50009 1 13.00 Mobile Phone 3 9.00 E wallet Male 0.00 4 ... 3 Divorced 2 1 14.00 0.00 1.00 2.00 127 2
9 50010 1 0.00 Mobile Phone 1 31.00 Debit Card Male 2.00 5 ... 3 Single 2 0 12.00 1.00 1.00 1.00 123 1

10 rows × 21 columns

In [46]:
#importing libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#defining coloumns (target column, needed for encoding later)
cust_id = ['CustomerID']
Target = ["Churn"]

#creating categorical list columns with unique values less than 7
cat_variable = ecd_clean.nunique()[ecd_clean.nunique() < 7].keys().tolist()
cat_variable = [x for x in cat_variable if x not in Target]

#creating numnerical list columns
numerical_vari = [x for x in ecd_clean if x not in cat_variable + Target + cust_id]

#separating columns with only 2 unique values as binary
binary_vari = ecd_clean.nunique()[ecd_clean.nunique() == 2].keys().tolist()

#separating rest into multiple
more_then_2_vari = [i for i in cat_variable if i not in binary_vari]
In [47]:
#label encoding binary columns
lab_encod = LabelEncoder()
for x in binary_vari:
    ecd_clean[x] = lab_encod.fit_transform(ecd_clean[x])
In [48]:
#label encoding binary columns
lab_encod = LabelEncoder()
for x in binary_vari:
    ecd_clean[x] = lab_encod.fit_transform(ecd_clean[x])
In [49]:
#summary
ecd_clean.describe().transpose()
Out[49]:
count mean std min 25% 50% 75% max
CustomerID 5,630.00 52,815.50 1,625.39 50,001.00 51,408.25 52,815.50 54,222.75 55,630.00
Churn 5,630.00 0.17 0.37 0.00 0.00 0.00 0.00 1.00
Tenure 5,630.00 9.71 8.63 0.00 1.00 8.00 15.00 61.00
PreferredLoginDevice 5,630.00 0.71 0.45 0.00 0.00 1.00 1.00 1.00
CityTier 5,630.00 1.65 0.92 1.00 1.00 1.00 3.00 3.00
WarehouseToHome 5,630.00 14.94 8.94 0.00 9.00 13.00 20.00 127.00
Gender 5,630.00 0.60 0.49 0.00 0.00 1.00 1.00 1.00
HourSpendOnApp 5,630.00 2.80 0.93 0.00 2.00 3.00 3.00 5.00
NumberOfDeviceRegistered 5,630.00 3.69 1.02 1.00 3.00 4.00 4.00 6.00
SatisfactionScore 5,630.00 3.07 1.38 1.00 2.00 3.00 4.00 5.00
NumberOfAddress 5,630.00 4.21 2.58 1.00 2.00 3.00 6.00 22.00
Complain 5,630.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
OrderAmountHikeFromlastYear 5,630.00 14.97 4.89 0.00 13.00 14.00 18.00 26.00
CouponUsed 5,630.00 1.67 1.89 0.00 1.00 1.00 2.00 16.00
OrderCount 5,630.00 2.87 2.94 0.00 1.00 2.00 3.00 16.00
DaySinceLastOrder 5,630.00 4.30 3.70 0.00 1.00 3.00 7.00 46.00
CashbackAmount 5,630.00 177.22 49.19 0.00 146.00 163.00 196.00 325.00
TenureGroup 5,630.00 1.55 0.75 1.00 1.00 1.00 2.00 7.00
In [50]:
import plotly.graph_objects as go

# Sample dataset columns with numeric variables
numeric_columns = [
    "Churn", "Tenure", "CityTier", "WarehouseToHome", "HourSpendOnApp", 
    "NumberOfDeviceRegistered", "SatisfactionScore", "NumberOfAddress", 
    "Complain", "OrderAmountHikeFromlastYear", "CouponUsed", "OrderCount", 
    "DaySinceLastOrder", "CashbackAmount"
]

# Generating random data for correlation (Replace this with your actual DataFrame)
np.random.seed(42)
df = pd.DataFrame(np.random.rand(5630, len(numeric_columns)), columns=numeric_columns)

# Compute correlation matrix
corr_matrix = df.corr().values

# Create heatmap
trace = go.Heatmap(
    z=corr_matrix,
    x=numeric_columns,
    y=numeric_columns,
    colorscale="Viridis",
    colorbar=dict(title="Pearson Correlation Coefficient", titleside="right")
)

# Layout settings
layout = go.Layout(
    title="Correlation Matrix for Variables",
    autosize=False,
    height=720,
    width=800,
    margin=dict(r=0, l=210, t=25, b=210),
    yaxis=dict(tickfont=dict(size=9)),
    xaxis=dict(tickfont=dict(size=9))
)

# Plot figure
fig = go.Figure(data=[trace], layout=layout)
fig.show()
In [51]:
# Splitting dataset into train and test sets
train, test = train_test_split(ecd_clean, test_size=0.20, random_state=0)

# Define the columns excluding CustomerID and Target variables
cust_id = ["CustomerID"]  # Assuming 'CustomerID' is the identifier
Target = ["Churn"]  # Assuming 'Churn' is the target variable

cols = [col for col in ecd_clean.columns if col not in cust_id + Target]

# Splitting into features (X) and target (Y)
X_train = train[cols]
Y_train = train[Target]
X_test = test[cols]
Y_test = test[Target]
In [52]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Identify categorical columns
categorical_columns = ["PreferredPaymentMode", "Gender", "MaritalStatus", "PreferredLoginDevice", "PreferedOrderCat"]

# Encode categorical variables using Label Encoding
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    ecd_clean[col] = label_encoders[col].fit_transform(ecd_clean[col])

# Define Customer ID and Target variable
cust_id = ["CustomerID"]  # Assuming 'CustomerID' is just an identifier
Target = ["Churn"]  # Assuming 'Churn' is the target variable

# Select feature columns (excluding CustomerID and Target)
cols = [col for col in ecd_clean.columns if col not in cust_id + Target]

# Split dataset into training and testing sets
train, test = train_test_split(ecd_clean, test_size=0.20, random_state=0)
X_train = train[cols]
Y_train = train[Target]
X_test = test[cols]
Y_test = test[Target]

# Train Logistic Regression Model
logistic_regression_model = LogisticRegression(random_state=0)
logistic_regression_model.fit(X_train, Y_train)

# Make predictions
Y_pred = logistic_regression_model.predict(X_test)

# Evaluate Model Performance
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Print Metrics
print(f"\nModel Performance Metrics:")
print(f"---------------------------------")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}\n")

# Generate and Visualize the Confusion Matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)

df_cm = pd.DataFrame(conf_matrix, index=["Actual: No Churn", "Actual: Churn"], 
                     columns=["Predicted: No Churn", "Predicted: Churn"])

plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, fmt='d', cmap='coolwarm', linewidths=1, cbar=True)
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

# Print Test Data Accuracy separately
print(f"Test Data Accuracy: {accuracy:.6f}")
Model Performance Metrics:
---------------------------------
Accuracy:  0.8694
Precision: 0.6639
Recall:    0.4332
F1 Score:  0.5243

No description has been provided for this image
Test Data Accuracy: 0.869449
In [53]:
# Generate a classification report
classification_report_result = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(classification_report_result)
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       939
           1       0.66      0.43      0.52       187

    accuracy                           0.87      1126
   macro avg       0.78      0.69      0.72      1126
weighted avg       0.86      0.87      0.86      1126

Decision Tree Classifier and Random Forest Classifier

In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Prepare data
X = ecd_clean.drop(['Churn'], axis=1)
y = ecd_clean['Churn']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=128)

# Train Decision Tree Classifier
deci_tree = DecisionTreeClassifier(random_state=42)
deci_tree.fit(X_train, y_train)

# Make predictions
deci_tree_pred = deci_tree.predict(X_test)

# Calculate evaluation metrics
accuracy_dt = accuracy_score(y_test, deci_tree_pred)
precision_dt = precision_score(y_test, deci_tree_pred)
recall_dt = recall_score(y_test, deci_tree_pred)

# Print performance metrics
print("\nDecision Tree Model Performance:")
print("---------------------------------")
print(f"Accuracy:  {accuracy_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall:    {recall_dt:.4f}\n")

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, deci_tree_pred)

# Visualizing confusion matrix with "YlGnBu" colormap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu", linewidths=1, cbar=True)

# Labels and title
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Decision Tree)")
plt.show()
Decision Tree Model Performance:
---------------------------------
Accuracy:  0.9440
Precision: 0.8208
Recall:    0.8744

No description has been provided for this image
In [55]:
#classification report for decision tree
class_report = classification_report(y_test, deci_tree_pred)
print("Classification Report for Decision Tree:\n", class_report)
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97       927
           1       0.82      0.87      0.85       199

    accuracy                           0.94      1126
   macro avg       0.90      0.92      0.91      1126
weighted avg       0.95      0.94      0.94      1126

In [56]:
!pip install rfpimp
from rfpimp import * 
Requirement already satisfied: rfpimp in c:\users\rutvi\anaconda3\lib\site-packages (1.3.7)
Requirement already satisfied: numpy in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (1.26.4)
Requirement already satisfied: pandas in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (2.1.4)
Requirement already satisfied: scikit-learn in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (1.6.1)
Requirement already satisfied: matplotlib in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (3.8.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\rutvi\anaconda3\lib\site-packages (from pandas->rfpimp) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\rutvi\anaconda3\lib\site-packages (from pandas->rfpimp) (2023.3)
Requirement already satisfied: scipy>=1.6.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (1.11.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (3.5.0)
Requirement already satisfied: six>=1.5 in c:\users\rutvi\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->rfpimp) (1.16.0)
In [57]:
import matplotlib.pyplot as plt

# Generating feature importance
feature_importance1 = deci_tree.feature_importances_
feature_names2 = X.columns

# Create a bar plot to visualize feature importance with customized colors
plt.figure(figsize=(10, 6))
plt.barh(feature_names2, feature_importance1, color='teal')  # Change the color to 'teal' or any color you like
plt.xlabel('Feature Importance', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title('Decision Tree Feature Importance', fontsize=14)

# Add gridlines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Display the plot
plt.show()
No description has been provided for this image
In [58]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Building our model using Random Forest Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)

rand_fcl = RandomForestClassifier(n_estimators=120)
rand_fcl.fit(X_train, y_train)

rand_fcl_pred = rand_fcl.predict(X_test)

# Evaluate the model's performance
accuracy_rfc = accuracy_score(y_test, rand_fcl_pred)
precision_rfc = precision_score(y_test, rand_fcl_pred)
recall_rfc = recall_score(y_test, rand_fcl_pred)

print("Accuracy:", accuracy_rfc)
print("Precision:", precision_rfc)
print("Recall:", recall_rfc)

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, rand_fcl_pred)
plt.figure(figsize=(8, 6))

# Using a lighter color palette like 'Blues' for the heatmap
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", cbar=False)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix Random Forest Classifier")
plt.show()
Accuracy: 0.9706927175843695
Precision: 0.9649122807017544
Recall: 0.859375
No description has been provided for this image
In [59]:
import matplotlib.pyplot as plt

# Generate feature importance
feature_importance2 = rand_fcl.feature_importances_
feature_names2 = X.columns

# Create a histogram-like bar plot to visualize feature importance with customized colors
plt.figure(figsize=(10, 6))
plt.barh(feature_names2, feature_importance2, color='lightcoral')  # Light coral color for the bars
plt.xlabel('Feature Importance', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title('Random Forest Feature Importance', fontsize=14)

# Add gridlines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Display the plot
plt.show()
No description has been provided for this image
In [ ]: